import pandas as pd
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

# ======== Configuration ========
# Input file path (assumes the expert/public top-50 keyword file already exists).
input_path = r"C:\Users\Lenovo\Desktop\000\价值维度匹配分析\公众偏好关键词.xlsx"
num_clusters = 8  # Desired number of value-dimension clusters; 5-8 is the suggested range.
output_path = r"C:\Users\Lenovo\Desktop\000\价值维度匹配分析\聚类_公众偏好关键词.xlsx"
# ===============================

# 1. Load the data.
df = pd.read_excel(input_path)
if "关键词" not in df.columns:
    # Fail early with a message that names the file, instead of a bare KeyError.
    raise KeyError(f"输入文件缺少“关键词”列：{input_path}")

# Blank spreadsheet cells are read as NaN and numeric cells as numbers; the
# encoder below is given a list of texts, so drop empty rows and coerce the
# remaining values to stripped strings before embedding.
df = df.dropna(subset=["关键词"]).copy()
df["关键词"] = df["关键词"].astype(str).str.strip()
df = df[df["关键词"] != ""].reset_index(drop=True)
keywords = df["关键词"].tolist()
if not keywords:
    raise ValueError(f"“关键词”列中没有可用的关键词：{input_path}")

# 2. Build sentence embeddings with a multilingual pretrained model.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embeddings = model.encode(keywords)

# 3. KMeans clustering.
# KMeans rejects n_clusters > n_samples, and duplicate keywords cannot fill
# separate clusters, so cap the cluster count at the number of distinct keywords.
effective_clusters = min(num_clusters, len(set(keywords)))
if effective_clusters < num_clusters:
    print(
        f"警告：不同关键词仅有 {effective_clusters} 个，"
        f"少于设定的簇数 {num_clusters}，已自动调整簇数。"
    )
kmeans = KMeans(n_clusters=effective_clusters, random_state=42, n_init="auto")
labels = kmeans.fit_predict(embeddings)

# 4. Collect the results: one row per cluster id with the list of its keywords.
df["簇编号"] = labels
clustered = df.groupby("簇编号")["关键词"].apply(list).reset_index()

# 5. Save the grouped result.
clustered.to_excel(output_path, index=False)

print("聚类完成！结果已保存到：", output_path)
